### This R script is to make Figure 2 of the teacher working conditions paper

### Steps to make it:
# 1) Get weighted average values for each measure and year
# 2) Do linear interpolation for 2020
# 3) Recenter the weighted average values around 2019
# 5) Plot

# Factor 1 name: Professional Work Environment
# Factor 2 name: Interactions with Students and their Families

# open necessary packages
library(tidyverse)
library(ggrepel)

# set your directory:
setwd("/Users/olgabaker/Desktop/Replication Package Summer 2025/Data")
data_17 <- read.csv("core_17.csv")
data_18 <- read.csv("core_18.csv")
data_19 <- read.csv("core_19.csv")
data_21 <- read.csv("core_21.csv")
data_22 <- read.csv("core_22.csv")
data_23 <- read.csv("core_23.csv")

data <- list(data_17,data_18,data_19,data_21,data_22,data_23) 
names(data) <- paste("data_",c(17:19,21:23),sep="")

# drop individual data objects
rm(data_17)
rm(data_18)
rm(data_19)
rm(data_21)
rm(data_22)
rm(data_23)

## Make names of teacher variables consistent across years. Rename variables if their variable name changed in a given 
# year. Change it to match the name in the "variable name" column of "Teacher Variables.xlsx"
data$data_18 <- data$data_18 %>% rename(cdis1=cdis)
data$data_19 <- data$data_19 %>% rename(cdis1=cdis)

# add factor_1,factor_2,simple_avg,cdis1 and tsaf to varnames
varnames <- c("factor_1","factor_2","simple_avg","cdis1","tsaf")

### 1) Get weighted average values for each measure and year ---------------------------------------------------
### Need to calculate weighted average scores for each teacher variable and year.
### Weight by propensity_score*teacher_n for 2017, 2018 and by teacher_n for 2019-2023

# drop schools without propensity scores
data$data_17 <- filter(data$data_17, !is.na(propensity_score)) # dropping 7 schools
data$data_18 <- filter(data$data_18, !is.na(propensity_score)) # dropping 9 schools 

## Label the weights you'll use for each year as "weight"
data$data_17 <- mutate(data$data_17, weight=propensity_score*teacher_n)
data$data_18 <- mutate(data$data_18, weight=propensity_score*teacher_n)
data$data_19 <- mutate(data$data_19, weight=teacher_n)
data$data_21 <- mutate(data$data_21, weight=teacher_n)
data$data_22 <- mutate(data$data_22, weight=teacher_n)
data$data_23 <- mutate(data$data_23, weight=teacher_n)

## Next write a function that calculates weighted average
weighted_avg <- function(x,var){ # be sure to type the variable name with "" surrounding it
  if(!is.null(x[[var]])){
    return(weighted.mean(x[[var]],x[["weight"]], na.rm=T))
  } else{return(NA)}}

# check work:
factor_1_avg <- sapply(data, function(x) weighted_avg(x, "factor_1"))

## Apply the function to all variables
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), length(data))
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(data, function(x) weighted_avg(x, var))
}

weighted_scores

### 2) Do linear interpolation for 2020  -------------------------------------------------------------------
weighted_scores <- as.data.frame(weighted_scores)
weighted_scores$data_20 <- NA
weighted_scores <- select(weighted_scores, data_17, data_18, data_19, data_20, data_21, data_22, data_23)
weighted_scores$data_20 <- (weighted_scores$data_19 + weighted_scores$data_21)/2

### 3) Do linear interpolation for remaining missing values ---------------------------------------------------
# Formula is: value_2019 + (missing_year - 2019)*(value_2023 - value_2019)/4
var <- which(rownames(weighted_scores)=="cdis1")
weighted_scores$data_20[var] <- weighted_scores$data_19[var] + 
  ((20-19)*(weighted_scores$data_23[var]-weighted_scores$data_19[var]))/(23-19)

weighted_scores$data_21[var] <- weighted_scores$data_19[var] + 
  ((21-19)*(weighted_scores$data_23[var]-weighted_scores$data_19[var]))/(23-19)

weighted_scores$data_22[var] <- weighted_scores$data_19[var] + 
  ((22-19)*(weighted_scores$data_23[var]-weighted_scores$data_19[var]))/(23-19)

weighted_scores

### 4) Recenter the weighted average values around 2019 ---------------------------------------------------
recentered_scores <- apply(weighted_scores,2, function(x) x-weighted_scores$data_19)

## scale by 20
recentered_scores <- recentered_scores/20

### 5) Plot: ---------------------------------------------------

# a) format the data for the plot:
plot_data <- t(rbind(Year=2017:2023, recentered_scores))
rownames(plot_data) <- NULL
plot_data <- as.data.frame(plot_data)

plot_data_backup <- plot_data
plot_data <- plot_data %>% 
  pivot_longer(
    cols = !Year,
    names_to = "survey_variable",
    values_to = "average"
    )


# b) make plot
plot_data %>%
  ggplot(aes(Year, average, group=survey_variable,color=survey_variable))+
  geom_hline(yintercept = 0, linewidth = 0.25)+
  geom_line() + 
  scale_x_continuous(name="Year", breaks = 2017:2023, minor_breaks = NULL)+
  scale_y_continuous(name="Point Value", limits = c(-1.3,0.7))+
  theme_bw() 

# c) grey out cdis1 and tsaf
plot_data %>%
  ggplot(aes(Year, average, group=survey_variable,color=survey_variable))+
  geom_hline(yintercept = 0, linewidth = 0.25)+
  geom_line(data=filter(plot_data, survey_variable%in%c("cdis1","tsaf")), color="gray",
            linewidth=0.25)+
  geom_line(data=filter(plot_data, !(survey_variable%in%c("cdis1","tsaf")))) + 
  scale_x_continuous(name="Year", breaks = 2017:2023, minor_breaks = NULL)+
  scale_y_continuous(name="Point Value", limits = c(-1.3,0.7))+
  theme_bw()
plot_data %>%
  ggplot(aes(Year, average, group=survey_variable,color=survey_variable))+
  geom_hline(yintercept = 0, linewidth = 0.25)+
  geom_line(data=filter(plot_data, survey_variable%in%c("cdis1","tsaf")), color="gray",
            linewidth=0.25)+
  geom_line(data=filter(plot_data, !(survey_variable%in%c("cdis1","tsaf")))) + 
  scale_x_continuous(name="Year", breaks = 2017:2023, minor_breaks = NULL)+
  scale_y_continuous(name="Point Value", limits = c(-1.3,0.7))+
  theme_bw() + 
  scale_color_discrete(name = "Factor", labels = c(
    "Professional Work Environment (Factor 1)", 
    "Interactions with Students and their Families (Factor 2)", 
    "Average across all Working Condition Measures")) 

# d) omit grey lines
plot_data %>%
  ggplot(aes(Year, average, group=survey_variable,color=survey_variable))+
  geom_hline(yintercept = 0, linewidth = 0.25)+
  geom_line(data=filter(plot_data, !(survey_variable%in%c("cdis1","tsaf")))) + 
  scale_x_continuous(name="Year", breaks = 2017:2023, minor_breaks = NULL)+
  scale_y_continuous(name="Point Value", limits = c(-0.6,0.2))+
  theme_bw() + 
  scale_color_discrete(name = "Factor", labels = c(
    "Professional Work Environment (Factor 1)", 
    "Interactions with Students and their Families (Factor 2)", 
    "Average across all Working Condition Measures")) 

# e) differentiate the grey lines
plot_data %>%
  ggplot(aes(Year, average, group=survey_variable,color=survey_variable))+
  geom_hline(yintercept = 0, linewidth = 0.25)+
  geom_line(data=filter(plot_data, survey_variable%in%c("cdis1")), color="gray",
            linewidth=0.25,linetype=1)+
  geom_line(data=filter(plot_data, survey_variable%in%c("tsaf")), color="gray",
            linewidth=0.25,linetype=2)+
  geom_line(data=filter(plot_data, !(survey_variable%in%c("cdis1","tsaf")))) + 
  scale_x_continuous(name="Year", breaks = 2017:2023, minor_breaks = NULL)+
  scale_y_continuous(name="Point Value", limits = c(-1.3,0.7))+
  theme_bw() + 
  scale_color_discrete(name = "Factor", labels = c(
    "Professional Work Environment (Factor 1)", 
    "Interactions with Students and their Families (Factor 2)", 
    "Average across all Working Condition Measures"))

# add classroom disruptions and tsaf to legend
plot_data %>%
  ggplot(aes(Year, average, group=survey_variable,color=survey_variable))+
  geom_hline(yintercept = 0, linewidth = 0.25)+
  geom_line(data=filter(plot_data, survey_variable%in%c("cdis1","tsaf")), color="gray",
            linewidth=0.25,aes(linetype = survey_variable))+
  scale_linetype_discrete(name = "Individual Measures", labels = c(
    "Classroom Disruptions", 
    "Teacher Safety"))+
  geom_line(data=filter(plot_data, !(survey_variable%in%c("cdis1","tsaf")))) + 
  scale_x_continuous(name="Year", breaks = 2017:2023, minor_breaks = NULL)+
  scale_y_continuous(name="Point Value", limits = c(-1.3,0.7))+
  theme_bw() + 
  scale_color_discrete(name = "Aggregate Measures", labels = c(
    "Professional Work Environment (Factor 1)", 
    "Interactions with Students and their Families (Factor 2)", 
    "Average across all Working Condition Measures"))
    #>> exported plot with dimension 1072x621